Loading the data

# Load the CanPath survey data.
# NOTE(review): absolute, user-specific path — consider a relative path or
# here::here() so the script runs on other machines.
data <- read_csv("C:/Users/Sohana/OneDrive - University of Saskatchewan/ML 898/data/can_path_data.csv")

Question 1. Dataset Preparation and EDA

Identify the variable types for each variable in the dataset

#glimpse(data)

Creating a new dataset with selected variables

# Keep the participant ID plus the twelve analysis variables.
analysis_vars <- c(
  "SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR", "SDC_EDU_LEVEL_AGE",
  "PSE_ADULT_WRK_DURATION", "PM_WAIST_HIP_RATIO_SR", "PA_SIT_AVG_TIME_DAY",
  "SLE_TIME", "NUT_VEG_QTY", "NUT_FRUITS_QTY", "NUT_JUICE_QTY", "ALC_CUR_FREQ"
)
data <- data |> dplyr::select(ID, dplyr::all_of(analysis_vars))

Preliminary analysis

Summary Statistics

# Summary table (reportRmd) for the twelve analysis variables.
rm_covsum(
  data = data,
  covs = c(
    "SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR", "SDC_EDU_LEVEL_AGE",
    "PSE_ADULT_WRK_DURATION", "PM_WAIST_HIP_RATIO_SR", "PA_SIT_AVG_TIME_DAY",
    "SLE_TIME", "NUT_VEG_QTY", "NUT_FRUITS_QTY", "NUT_JUICE_QTY", "ALC_CUR_FREQ"
  )
)
n=41187
SDC AGE CALC
Mean (sd) 51.5 (10.8)
Median (Min,Max) 52 (30, 74)
PA TOTAL SHORT
Mean (sd) 2574.1 (2656.2)
Median (Min,Max) 1782 (0, 19278)
Missing 6763
PM BMI SR
Mean (sd) 27.5 (6.2)
Median (Min,Max) 26.6 (8.9, 69.4)
Missing 11976
SDC EDU LEVEL AGE
Mean (sd) 25.4 (9.2)
Median (Min,Max) 23 (-7, 73)
Missing 2817
PSE ADULT WRK DURATION
Mean (sd) 6.6 (9.4)
Median (Min,Max) 2 (0, 51)
Missing 5731
PM WAIST HIP RATIO SR
Mean (sd) 0.9 (0.1)
Median (Min,Max) 0.9 (0.3, 2.4)
Missing 17734
PA SIT AVG TIME DAY
Mean (sd) 391.1 (366.9)
Median (Min,Max) 360 (0, 9999)
Missing 11257
SLE TIME
Mean (sd) 435.9 (75.1)
Median (Min,Max) 430 (0, 1390)
Missing 2792
NUT VEG QTY
Mean (sd) 2.7 (1.7)
Median (Min,Max) 2 (0, 35)
Missing 2549
NUT FRUITS QTY
Mean (sd) 2.1 (1.4)
Median (Min,Max) 2 (0, 25)
Missing 2426
NUT JUICE QTY
Mean (sd) 0.8 (1.1)
Median (Min,Max) 1 (0, 44)
Missing 3583
ALC CUR FREQ
Mean (sd) 3.0 (3.3)
Median (Min,Max) 3 (-7, 7)
Missing 1683

Data Cleaning

## Two variables use -7 as a "missing" code. Recode negative values to NA.
data <- data %>%
  mutate(across(
    c(SDC_EDU_LEVEL_AGE, ALC_CUR_FREQ),
    \(x) if_else(x < 0, NA_real_, x)
  ))

# Cap average daily sitting time at 360 minutes to tame implausible values
# (the raw max was 9999). pmin() is the idiomatic form of the original
# case_when() and preserves NA the same way.
data <- data %>%
  mutate(PA_SIT_AVG_TIME_DAY = pmin(PA_SIT_AVG_TIME_DAY, 360))

### Complete-case analysis: remove every row with at least one missing value.
data <- data |> drop_na()

Summary Statistics after cleaning

# Summary table again, now on the cleaned complete-case data.
rm_covsum(
  data = data,
  covs = c(
    "SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR", "SDC_EDU_LEVEL_AGE",
    "PSE_ADULT_WRK_DURATION", "PM_WAIST_HIP_RATIO_SR", "PA_SIT_AVG_TIME_DAY",
    "SLE_TIME", "NUT_VEG_QTY", "NUT_FRUITS_QTY", "NUT_JUICE_QTY", "ALC_CUR_FREQ"
  )
)
n=13242
SDC AGE CALC
Mean (sd) 51.8 (10.5)
Median (Min,Max) 52 (30, 74)
PA TOTAL SHORT
Mean (sd) 2757.5 (2658.1)
Median (Min,Max) 2010 (0, 19278)
PM BMI SR
Mean (sd) 27.3 (5.8)
Median (Min,Max) 26.5 (8.9, 69.4)
SDC EDU LEVEL AGE
Mean (sd) 25.3 (8.3)
Median (Min,Max) 23 (1, 73)
PSE ADULT WRK DURATION
Mean (sd) 6.5 (9.3)
Median (Min,Max) 2 (0, 51)
PM WAIST HIP RATIO SR
Mean (sd) 0.9 (0.1)
Median (Min,Max) 0.9 (0.4, 2.4)
PA SIT AVG TIME DAY
Mean (sd) 303.1 (78.0)
Median (Min,Max) 360 (0, 360)
SLE TIME
Mean (sd) 435.7 (69.2)
Median (Min,Max) 430 (0, 960)
NUT VEG QTY
Mean (sd) 2.7 (1.6)
Median (Min,Max) 2 (0, 30)
NUT FRUITS QTY
Mean (sd) 2.1 (1.3)
Median (Min,Max) 2 (0, 21)
NUT JUICE QTY
Mean (sd) 0.8 (1.0)
Median (Min,Max) 1 (0, 15)
ALC CUR FREQ
Mean (sd) 3.7 (2.1)
Median (Min,Max) 4 (0, 7)

Correlations

# Pairwise correlations (corrr): lower-triangle plot with printed
# coefficients and angled x-axis labels.
corr_plot <- data |>
  correlate() |>
  rearrange() |>
  shave() |>
  rplot(print_cor = TRUE)
corr_plot + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 2. Baseline Model Implementation

Recipe for PCA

# PCA recipe: ID is kept as an identifier; all remaining columns are
# standardized and projected onto principal components.
# Centering is applied before scaling to follow convention and match the
# k-means recipe below (the standardized result is identical either way).
pca_recipe <- recipe(~., data = data) %>%
  update_role(ID, new_role = "id") %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors(), id = "pca_id")

pca_recipe

Prep the recipe

# Estimate the recipe steps (means, SDs, loadings);
# retain = TRUE keeps the processed training data for reuse.
pca_prepared <- pca_recipe |> prep(retain = TRUE)

pca_prepared

Bake the recipe to get the PCA components

# Apply the trained recipe to obtain the principal-component scores.
pca_baked <- pca_prepared |> bake(new_data = data)
pca_baked
## # A tibble: 13,242 × 6
##    ID             PC1    PC2     PC3    PC4     PC5
##    <chr>        <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 SYN_58624   0.746  -1.57   0.418  -0.781  0.118 
##  2 SYN_58628   0.291   0.166 -1.24   -1.18  -0.837 
##  3 SYN_586217  1.02    0.482 -0.381  -0.518 -0.797 
##  4 SYN_586219 -0.170  -2.79  -0.830   0.870 -0.0621
##  5 SYN_586221 -0.144   0.509  0.441  -1.08  -1.91  
##  6 SYN_586230  0.0477 -1.22  -1.16   -0.107  0.0144
##  7 SYN_586246 -1.21    0.275 -0.534  -0.441 -0.222 
##  8 SYN_586247 -1.41    3.44  -1.13    0.478 -1.16  
##  9 SYN_586250 -1.28   -0.973 -1.13    0.145 -0.661 
## 10 SYN_586256 -0.228  -1.83  -0.0119 -0.544 -0.349 
## # ℹ 13,232 more rows

Components loadings

# Component loadings, with +/-0.2 reference lines to flag variables that
# contribute meaningfully to each component.
pca_variables <- tidy(pca_prepared, id = "pca_id", type = "coef")

ggplot(pca_variables, aes(x = value, y = terms, color = component)) +
  geom_point() +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = c(-0.2, 0.2), linetype = "dashed") +
  facet_wrap(~ component) +
  labs(color = NULL) +
  theme_minimal()

Variance explained by components

pca_variances <- tidy(pca_prepared, id = "pca_id", type = "variance")

# Percent variance per component. Order the factor by the numeric component
# index so the x-axis reads PC1, PC2, ..., PC10 rather than the alphabetical
# order as.factor() would give for "PC"-style labels ("PC1", "PC10", "PC2", ...).
# The original computed `comp` but never used it for ordering.
pca_variance <- pca_variances |> filter(terms == "percent variance")
pca_variance$comp <- as.numeric(gsub("[^0-9]", "", pca_variance$component))
pca_variance$component <- factor(
  pca_variance$component,
  levels = unique(pca_variance$component[order(pca_variance$comp)])
)

ggplot(pca_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  labs(x = "Principal Components", y = "Variance explained (%)") +
  theme_minimal()

Bar Chart for explained variance

# Bar chart of percent variance explained, labelled to two decimals.
pca_variance2 <- ggplot(pca_variance, aes(x = component, y = value)) +
  geom_col(fill = "lightgreen") +
  geom_text(
    aes(label = paste0(round(value, 2), "%")),
    vjust = -0.5, size = 3
  ) +
  labs(x = "Principal Components", y = "Percentage Explained Variance") +
  theme_minimal()

pca_variance2

Interpretation: The bar chart shows that PC1 explains about 13.37% of the variance, PC2 explains about 12.69% of the variance, PC3 explains about 9.93% of the variance and PC4 explains about 9.23% of the variance. The remaining components explain less than 9% of the variance each.

Cumulative percent variance

# Cumulative percent variance. As with the per-component plot, order the
# factor levels by the numeric component index so the x-axis is PC1, PC2, ...
# Also fixes the "Cummulative" typo in the axis label.
pca_cummul_variance <- pca_variances |> filter(terms == "cumulative percent variance")
pca_cummul_variance$comp <- as.numeric(gsub("[^0-9]", "", pca_cummul_variance$component))
pca_cummul_variance$component <- factor(
  pca_cummul_variance$component,
  levels = unique(pca_cummul_variance$component[order(pca_cummul_variance$comp)])
)

ggplot(pca_cummul_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  labs(x = "Principal Components", y = "Cumulative Variance explained (%)") +
  theme_minimal()

Interpretation: The cumulative variance plot shows that the first few components explain a large portion of the variance in the data. PC1 to PC4 explain about 50% of the variance and PC1 to PC7 explain about 80% of the variance. This suggests that we can reduce the dimensionality of the data while retaining a significant amount of information by keeping only the first few principal components.

Relationship between components

# Correlations among the PCA scores (components should be near-orthogonal).
pca_corr <- dplyr::select(pca_baked, -ID)

pca_corr |>
  correlate() |>
  rearrange() |>
  shave() |>
  rplot(print_cor = TRUE)

# PC1 vs PC2 scatter with a linear fit overlay.
pca_baked |>
  ggplot(aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs(x = "PC1", y = "PC2") +
  theme_minimal()

Interpretation: The correlation plot shows that the principal components are uncorrelated with each other, which is a key property of PCA. The scatter plot of PC1 vs PC2 shows that there is no clear linear relationship between these two components, which is expected since they are uncorrelated. This suggests that the PCA has successfully transformed the original correlated variables into a new set of uncorrelated components.

# Interactive 3D scatter of PC3-PC5 (plotly).
scatter_3d <- pca_baked |>
  plot_ly(
    x = ~PC3, y = ~PC4, z = ~PC5,
    type = "scatter3d", mode = "markers",
    marker = list(size = 3)
  ) |>
  layout(
    title = "3D Scatter Plot",
    scene = list(
      xaxis = list(title = "PC3"),
      yaxis = list(title = "PC4"),
      zaxis = list(title = "PC5")
    )
  )

# Display the 3D scatter plot
scatter_3d

Interpretation: The 3D scatter plot of PC3, PC4, and PC5 shows that there is no clear clustering or grouping of the data points in this reduced dimensional space. This suggests that these components may not capture distinct patterns or clusters in the data, and further analysis may be needed to identify meaningful groupings.

Question 3. Cluster Selection Tuning

Model

set.seed(10)

# K-means specification with a fixed k = 4, fitted via stats::kmeans().
kmeans_model <- k_means(num_clusters = 4) |>
  set_engine("stats")

kmeans_model
## K Means Cluster Specification (partition)
## 
## Main Arguments:
##   num_clusters = 4
## 
## Computational engine: stats

Recipe

# Preprocessing for k-means: keep ID as an identifier, drop zero-variance
# columns, then standardize the numeric predictors.
kmeans_recipe <- recipe(~., data = data) |>
  update_role(ID, new_role = "id") |>
  step_zv(all_predictors()) |>
  step_center(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors())

Workflow

set.seed(10)

# Bundle the preprocessing recipe with the k-means specification.
kmeans_workflow <- workflow() |>
  add_model(kmeans_model) |>
  add_recipe(kmeans_recipe)

Fit the model

set.seed(10)

# Fit the 4-cluster model on the full cleaned data set.
kmeans_fit <- fit(kmeans_workflow, data = data)

kmeans_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_zv()
## • step_center()
## • step_scale()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 4 clusters of sizes 4423, 3512, 3301, 2006
## 
## Cluster means:
##   SDC_AGE_CALC PA_TOTAL_SHORT  PM_BMI_SR SDC_EDU_LEVEL_AGE
## 3   -0.7272835     -0.2579418 -0.4554022       -0.19251605
## 4    0.2663898     -0.3231741  0.7748929       -0.01345518
## 1    0.2095763      0.5756346 -0.2792562        0.32719500
## 2    0.7923245      0.1872851  0.1070014       -0.09038765
##   PSE_ADULT_WRK_DURATION PM_WAIST_HIP_RATIO_SR PA_SIT_AVG_TIME_DAY     SLE_TIME
## 3             -0.4348796            -0.4250043          0.19351286 -0.050096574
## 4             -0.2920426             0.5765997          0.28102106 -0.006145864
## 1             -0.2597760            -0.1763795         -0.53365711  0.006988034
## 2              1.8976305             0.2178487         -0.04050411  0.109717807
##   NUT_VEG_QTY NUT_FRUITS_QTY NUT_JUICE_QTY ALC_CUR_FREQ
## 3  -0.2473456     -0.2431350   -0.12233785   -0.3732542
## 4  -0.3657558     -0.3219000    0.12873015    0.1738716
## 1   0.8706258      0.8441120   -0.03233799    0.2058765
## 2  -0.2469552     -0.2893893    0.09758113    0.1797945
## 
## Clustering vector:
##     [1] 1 2 3 1 2 2 2 4 2 1 1 3 1 3 3 1 3 4 2 2 3 2 1 2 2 2 1 2 2 3 3 3 2 3 4 2
##    [37] 1 2 4 3 2 2 3 4 1 2 3 1 4 1 1 4 1 1 3 3 1 1 1 2 1 4 4 2 2 1 1 1 3 2 1 2
##    [73] 4 3 4 3 2 1 3 1 1 2 2 1 1 3 2 3 3 1 3 3 1 4 3 2 2 2 3 4 1 3 1 2 4 1 2 2
##   [109] 4 2 1 4 2 2 2 2 2 2 1 3 4 2 2 1 3 3 2 1 2 3 1 1 2 2 3 2 1 1 3 1 1 1 2 1
##   [145] 1 2 3 2 1 2 1 2 1 2 2 2 4 4 2 1 2 1 1 2 2 2 3 4 3 2 2 2 1 2 3 1 2 2 1 2
##   [181] 1 2 3 1 3 4 1 2 1 1 3 4 2 1 3 4 3 2 4 3 3 1 1 1 1 3 4 1 1 3 3 2 3 1 1 3
##   [217] 2 2 4 4 2 2 1 1 3 2 4 4 4 2 2 1 4 4 1 1 1 2 1 2 3 2 4 2 3 2 2 4 3 1 2 3
##   [253] 1 4 2 1 2 2 3 2 1 2 2 1 1 1 3 4 1 1 1 1 1 3 2 1 1 1 3 2 2 2 4 3 4 2 1 1
##   [289] 1 1 1 4 1 4 1 2 2 2 2 2 2 3 1 2 1 1 1 3 2 2 2 3 2 1 2 2 2 2 1 2 2 1 1 2
##   [325] 1 4 2 1 4 2 3 4 1 2 1 3 2 1 2 4 1 3 2 4 1 4 2 1 1 1 1 4 1 2 1 2 2 1 1 2
##   [361] 3 2 4 1 1 2 3 3 1 1 1 1 2 2 2 3 2 2 4 3 1 4 3 1 2 2 3 1 2 1 1 2 2 1 4 2
##   [397] 2 3 2 1 3 3 1 1 1 1 4 4 1 2 3 1 2 2 2 1 1 4 2 2 1 2 4 3 1 3 3 1 1 1 2 3
##   [433] 2 2 4 1 2 1 1 1 3 2 1 2 1 1 3 1 1 4 1 1 2 1 1 3 1 1 2 2 4 1 1 1 2 3 4 1
##   [469] 1 3 1 1 2 3 2 1 4 1 2 1 4 2 3 3 3 4 1 4 1 4 3 4 1 1 3 1 1 2 1 2 2 2 3 3
##   [505] 3 2 2 1 1 1 1 3 2 3 1 1 1 1 1 1 1 4 2 1 3 1 2 1 2 2 2 3 3 1 1 1 1 2 4 1
##   [541] 3 2 2 1 2 1 4 3 1 1 3 2 2 1 2 1 2 4 1 2 2 3 4 3 2 4 2 1 2 3 3 3 3 2 3 1
##   [577] 1 2 1 4 3 2 3 2 4 1 1 1 2 3 4 1 4 1 2 2 2 3 1 2 2 1 4 3 1 1 3 3 2 1 1 1
##   [613] 2 1 2 1 1 1 3 1 4 2 3 4 3 2 3 1 2 2 1 1 2 2 1 4 3 1 3 2 4 2 2 1 3 3 2 1
##   [649] 3 4 2 3 1 1 1 2 1 2 3 1 1 4 2 1 1 2 2 2 3 4 1 3 3 2 1 3 1 1 3 2 2 1 2 3
##   [685] 2 1 4 3 1 3 1 3 3 3 1 2 1 2 4 2 4 2 4 1 2 3 4 1 1 1 2 1 1 2 1 2 1 1 2 3
##   [721] 4 4 1 3 1 2 2 3 2 1 2 2 1 3 1 1 3 2 1 2 3 4 1 1 2 3 1 2 2 3 2 1 3 1 4 1
##   [757] 1 1 3 2 3 1 2 4 3 1 1 2 3 1 1 1 2 3 1 1 1 4 1 1 3 2 2 2 1 3 3 2 2 2 3 3
##   [793] 3 2 1 1 2 3 1 2 3 1 2 2 1 2 2 1 4 2 4 4 1 2 2 3 1 1 4 4 2 3 1 2 1 2 2 3
##   [829] 2 1 3 3 2 1 1 2 1 3 2 4 3 1 2 1 4 2 3 1 2 2 3 2 1 1 3 4 3 1 2 2 2 4 1 1
##   [865] 3 2 2 2 2 2 1 1 2 3 2 3 3 2 3 2 2 2 2 2 3 2 2 3 2 1 4 2 3 2 1 3 4 4 1 3
##   [901] 2 1 2 2 4 1 3 3 4 4 2 3 1 2 3 3 1 3 1 1 2 1 2 2 1 4 3 2 2 4 1 3 3 1 3 3
##   [937] 1 2 1 3 1 2 3 2 1 2 2 3 1 1 1 2 2 4 3 2 2 3 2 1 4 1 1 1 2 1 3 2 4 4 1 1
##   [973] 3 2 1 2 4 1 3 2 4 2 4 1 1 2 1 3 1 2 2 4 4 4 3 3 1 1 2 3 2 3 1 3 1 3 1 2
##  [1009] 2 2 1 1 2 2 3 1 1 1 2 1 1 2 1 4 3 1 2 1 1 1 2 1 3 4 1 2 4 2 1 3 2 4 1 4
##  [1045] 1 3 2 2 3 2 2 3 3 1 1 1 3 3 3 2 2 1 1 1 2 3 2 4 2 3 3 1 4 2 1 1 1 2 1 1
## 
## ...
## and 347 more lines.

Save the cluster variable

# Attach each observation's cluster assignment to the data.
assignments <- as.data.frame(extract_cluster_assignment(kmeans_fit))

names(assignments) <- c("cluster")
data$clusters <- assignments$cluster


summary(data$clusters)
## Cluster_1 Cluster_2 Cluster_3 Cluster_4 
##      4423      3512      3301      2006

Visualize the clusters

# Pairwise plot of selected variables, colored by cluster membership.
plot_vars <- c(
  "SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR", "SDC_EDU_LEVEL_AGE",
  "PSE_ADULT_WRK_DURATION", "PA_SIT_AVG_TIME_DAY", "ALC_CUR_FREQ", "clusters"
)
data %>%
  dplyr::select(dplyr::all_of(plot_vars)) %>%
  ggpairs(aes(fill = clusters, color = clusters))

Visualize centroid of each cluster for each variable

# Cluster centroids, reshaped to long format so each variable's center can
# be compared across clusters in a single plot.
centroids <- extract_centroids(kmeans_fit)

centroids_long <- centroids |>
  pivot_longer(
    cols = c(
      "SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR", "SDC_EDU_LEVEL_AGE",
      "PSE_ADULT_WRK_DURATION", "PM_WAIST_HIP_RATIO_SR", "PA_SIT_AVG_TIME_DAY",
      "SLE_TIME", "NUT_VEG_QTY", "NUT_FRUITS_QTY", "NUT_JUICE_QTY",
      "ALC_CUR_FREQ"
    ),
    names_to = "name",
    values_to = "value"
  )

ggplot(centroids_long, aes(x = name, y = value, group = .cluster, color = .cluster)) +
  geom_point() +
  geom_line() +
  labs(x = "", y = "Value at cluster center") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation: The plot of the cluster centroids shows that there are distinct patterns in the values of the variables for each cluster. For example, Cluster 1 has higher values for SDC_AGE_CALC and PA_TOTAL_SHORT, while Cluster 2 has higher values for PM_BMI_SR and SDC_EDU_LEVEL_AGE. Cluster 3 has higher values for PSE_ADULT_WRK_DURATION and PA_SIT_AVG_TIME_DAY, while Cluster 4 has higher values for ALC_CUR_FREQ. These patterns suggest that the clusters may represent different subgroups of individuals with varying characteristics related to age, physical activity, BMI, education level, work duration, sitting time, and alcohol consumption. Further analysis may be needed to interpret the meaning of these clusters in the context of the data and research question.

Model with tuning of cluster

# K-means specification with the number of clusters left as a tuning
# parameter.
kmeans_model_tune <- k_means(num_clusters = tune()) |>
  set_engine("stats")

Workflow with tuning of clusters

# Workflow for tuning k (name kept as-is; it is referenced again below).
kmodes_workflow_tune <- workflow() |>
  add_model(kmeans_model_tune) |>
  add_recipe(kmeans_recipe)

# 2-fold CV over k = 1..10, scored by average silhouette width.
# NOTE(review): `data` already carries the `clusters` column at this point;
# the recipe's variable list was fixed before that column existed — confirm
# it is excluded from the distance computation.
folds <- vfold_cv(data, v = 2)
grid <- tibble(num_clusters = 1:10)
tuned_model <- tune_cluster(
  kmodes_workflow_tune,
  resamples = folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg),
  control = control_resamples(
    save_pred = TRUE,
    verbose = TRUE,
    parallel_over = "everything"
  )
)
collect_metrics(tuned_model) %>% head()
## # A tibble: 6 × 7
##   num_clusters .metric        .estimator     mean     n   std_err .config       
##          <int> <chr>          <chr>         <dbl> <int>     <dbl> <chr>         
## 1            1 silhouette_avg standard   NaN          0 NA        Preprocessor1…
## 2            2 silhouette_avg standard     0.0956     2  0.00225  Preprocessor1…
## 3            3 silhouette_avg standard     0.0891     2  0.00469  Preprocessor1…
## 4            4 silhouette_avg standard     0.0921     2  0.00353  Preprocessor1…
## 5            5 silhouette_avg standard     0.0865     2  0.00669  Preprocessor1…
## 6            6 silhouette_avg standard     0.0728     2  0.000896 Preprocessor1…
autoplot(tuned_model)

Interpretation: The silhouette method was used to evaluate clustering performance across different values of k. The highest average silhouette width was observed at k = 2 (approximately 0.096), indicating that two clusters provide the best separation among the tested configurations. Although k = 2 scored highest, k = 4 also shows relatively strong performance, and for this assignment k = 4 was selected as the number of clusters. However, the overall silhouette values are relatively low, suggesting modest cluster separation in the data.

Cluster tuning with 4 clusters

# Copy of the data without the saved cluster assignments.
data2 <- dplyr::select(data, -clusters)

Model

set.seed(10)

# Final k-means specification with the selected k = 4.
kmeans_final_model <- k_means(num_clusters = 4) |>
  set_engine("stats")
kmeans_final_model
## K Means Cluster Specification (partition)
## 
## Main Arguments:
##   num_clusters = 4
## 
## Computational engine: stats

Recipe

# Same preprocessing as before, now defined on the cluster-free data set.
kmeans_recipe_final <- recipe(~., data = data2) %>%
  update_role(ID, new_role = "id") %>%
  step_zv(all_predictors()) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())
kmeans_recipe_final

Workflow

set.seed(10)

# Final workflow: cluster-free recipe + final k-means specification.
kmeans_workflow_final <- workflow() %>%
  add_model(kmeans_final_model) %>%
  add_recipe(kmeans_recipe_final)

Fit the model

set.seed(10)

# Refit the 4-cluster model on data2.
kmeans_fit_final <- fit(kmeans_workflow_final, data = data2)

kmeans_fit_final
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_zv()
## • step_center()
## • step_scale()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 4 clusters of sizes 4423, 3512, 3301, 2006
## 
## Cluster means:
##   SDC_AGE_CALC PA_TOTAL_SHORT  PM_BMI_SR SDC_EDU_LEVEL_AGE
## 3   -0.7272835     -0.2579418 -0.4554022       -0.19251605
## 4    0.2663898     -0.3231741  0.7748929       -0.01345518
## 1    0.2095763      0.5756346 -0.2792562        0.32719500
## 2    0.7923245      0.1872851  0.1070014       -0.09038765
##   PSE_ADULT_WRK_DURATION PM_WAIST_HIP_RATIO_SR PA_SIT_AVG_TIME_DAY     SLE_TIME
## 3             -0.4348796            -0.4250043          0.19351286 -0.050096574
## 4             -0.2920426             0.5765997          0.28102106 -0.006145864
## 1             -0.2597760            -0.1763795         -0.53365711  0.006988034
## 2              1.8976305             0.2178487         -0.04050411  0.109717807
##   NUT_VEG_QTY NUT_FRUITS_QTY NUT_JUICE_QTY ALC_CUR_FREQ
## 3  -0.2473456     -0.2431350   -0.12233785   -0.3732542
## 4  -0.3657558     -0.3219000    0.12873015    0.1738716
## 1   0.8706258      0.8441120   -0.03233799    0.2058765
## 2  -0.2469552     -0.2893893    0.09758113    0.1797945
## 
## Clustering vector:
##     [1] 1 2 3 1 2 2 2 4 2 1 1 3 1 3 3 1 3 4 2 2 3 2 1 2 2 2 1 2 2 3 3 3 2 3 4 2
##    [37] 1 2 4 3 2 2 3 4 1 2 3 1 4 1 1 4 1 1 3 3 1 1 1 2 1 4 4 2 2 1 1 1 3 2 1 2
##    [73] 4 3 4 3 2 1 3 1 1 2 2 1 1 3 2 3 3 1 3 3 1 4 3 2 2 2 3 4 1 3 1 2 4 1 2 2
##   [109] 4 2 1 4 2 2 2 2 2 2 1 3 4 2 2 1 3 3 2 1 2 3 1 1 2 2 3 2 1 1 3 1 1 1 2 1
##   [145] 1 2 3 2 1 2 1 2 1 2 2 2 4 4 2 1 2 1 1 2 2 2 3 4 3 2 2 2 1 2 3 1 2 2 1 2
##   [181] 1 2 3 1 3 4 1 2 1 1 3 4 2 1 3 4 3 2 4 3 3 1 1 1 1 3 4 1 1 3 3 2 3 1 1 3
##   [217] 2 2 4 4 2 2 1 1 3 2 4 4 4 2 2 1 4 4 1 1 1 2 1 2 3 2 4 2 3 2 2 4 3 1 2 3
##   [253] 1 4 2 1 2 2 3 2 1 2 2 1 1 1 3 4 1 1 1 1 1 3 2 1 1 1 3 2 2 2 4 3 4 2 1 1
##   [289] 1 1 1 4 1 4 1 2 2 2 2 2 2 3 1 2 1 1 1 3 2 2 2 3 2 1 2 2 2 2 1 2 2 1 1 2
##   [325] 1 4 2 1 4 2 3 4 1 2 1 3 2 1 2 4 1 3 2 4 1 4 2 1 1 1 1 4 1 2 1 2 2 1 1 2
##   [361] 3 2 4 1 1 2 3 3 1 1 1 1 2 2 2 3 2 2 4 3 1 4 3 1 2 2 3 1 2 1 1 2 2 1 4 2
##   [397] 2 3 2 1 3 3 1 1 1 1 4 4 1 2 3 1 2 2 2 1 1 4 2 2 1 2 4 3 1 3 3 1 1 1 2 3
##   [433] 2 2 4 1 2 1 1 1 3 2 1 2 1 1 3 1 1 4 1 1 2 1 1 3 1 1 2 2 4 1 1 1 2 3 4 1
##   [469] 1 3 1 1 2 3 2 1 4 1 2 1 4 2 3 3 3 4 1 4 1 4 3 4 1 1 3 1 1 2 1 2 2 2 3 3
##   [505] 3 2 2 1 1 1 1 3 2 3 1 1 1 1 1 1 1 4 2 1 3 1 2 1 2 2 2 3 3 1 1 1 1 2 4 1
##   [541] 3 2 2 1 2 1 4 3 1 1 3 2 2 1 2 1 2 4 1 2 2 3 4 3 2 4 2 1 2 3 3 3 3 2 3 1
##   [577] 1 2 1 4 3 2 3 2 4 1 1 1 2 3 4 1 4 1 2 2 2 3 1 2 2 1 4 3 1 1 3 3 2 1 1 1
##   [613] 2 1 2 1 1 1 3 1 4 2 3 4 3 2 3 1 2 2 1 1 2 2 1 4 3 1 3 2 4 2 2 1 3 3 2 1
##   [649] 3 4 2 3 1 1 1 2 1 2 3 1 1 4 2 1 1 2 2 2 3 4 1 3 3 2 1 3 1 1 3 2 2 1 2 3
##   [685] 2 1 4 3 1 3 1 3 3 3 1 2 1 2 4 2 4 2 4 1 2 3 4 1 1 1 2 1 1 2 1 2 1 1 2 3
##   [721] 4 4 1 3 1 2 2 3 2 1 2 2 1 3 1 1 3 2 1 2 3 4 1 1 2 3 1 2 2 3 2 1 3 1 4 1
##   [757] 1 1 3 2 3 1 2 4 3 1 1 2 3 1 1 1 2 3 1 1 1 4 1 1 3 2 2 2 1 3 3 2 2 2 3 3
##   [793] 3 2 1 1 2 3 1 2 3 1 2 2 1 2 2 1 4 2 4 4 1 2 2 3 1 1 4 4 2 3 1 2 1 2 2 3
##   [829] 2 1 3 3 2 1 1 2 1 3 2 4 3 1 2 1 4 2 3 1 2 2 3 2 1 1 3 4 3 1 2 2 2 4 1 1
##   [865] 3 2 2 2 2 2 1 1 2 3 2 3 3 2 3 2 2 2 2 2 3 2 2 3 2 1 4 2 3 2 1 3 4 4 1 3
##   [901] 2 1 2 2 4 1 3 3 4 4 2 3 1 2 3 3 1 3 1 1 2 1 2 2 1 4 3 2 2 4 1 3 3 1 3 3
##   [937] 1 2 1 3 1 2 3 2 1 2 2 3 1 1 1 2 2 4 3 2 2 3 2 1 4 1 1 1 2 1 3 2 4 4 1 1
##   [973] 3 2 1 2 4 1 3 2 4 2 4 1 1 2 1 3 1 2 2 4 4 4 3 3 1 1 2 3 2 3 1 3 1 3 1 2
##  [1009] 2 2 1 1 2 2 3 1 1 1 2 1 1 2 1 4 3 1 2 1 1 1 2 1 3 4 1 2 4 2 1 3 2 4 1 4
##  [1045] 1 3 2 2 3 2 2 3 3 1 1 1 3 3 3 2 2 1 1 1 2 3 2 4 2 3 3 1 4 2 1 1 1 2 1 1
## 
## ...
## and 347 more lines.

Save the final cluster variable

# Store the final model's cluster assignments on data2.
final_assignments <- as.data.frame(extract_cluster_assignment(kmeans_fit_final))

names(final_assignments) <- c("cluster")
data2$clusters <- final_assignments$cluster

Workflow

# Re-tune k using the final (cluster-free) recipe.
kmodel_workflow_tune <- workflow() %>%
    add_recipe(kmeans_recipe_final) %>%
    add_model(kmeans_model_tune)

# BUG FIX: the original called tune_cluster() on `kmodes_workflow_tune`
# (the earlier workflow) with resamples drawn from `data` (which already
# contains the `clusters` column), so this second tuning silently repeated
# the first one — note the identical metrics in the output below.
# Tune the workflow defined above on `data2` instead.
folds <- vfold_cv(data2, v = 2)
grid <- tibble(num_clusters = 1:10)
tuned_model <- tune_cluster(
  kmodel_workflow_tune,
  resamples = folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg),
  control = control_resamples(
    save_pred = TRUE,
    verbose = TRUE,
    parallel_over = "everything"
  )
)
collect_metrics(tuned_model) %>% head()
## # A tibble: 6 × 7
##   num_clusters .metric        .estimator     mean     n   std_err .config       
##          <int> <chr>          <chr>         <dbl> <int>     <dbl> <chr>         
## 1            1 silhouette_avg standard   NaN          0 NA        Preprocessor1…
## 2            2 silhouette_avg standard     0.0956     2  0.00225  Preprocessor1…
## 3            3 silhouette_avg standard     0.0891     2  0.00469  Preprocessor1…
## 4            4 silhouette_avg standard     0.0921     2  0.00353  Preprocessor1…
## 5            5 silhouette_avg standard     0.0865     2  0.00669  Preprocessor1…
## 6            6 silhouette_avg standard     0.0728     2  0.000896 Preprocessor1…

Question 4. PCA Regression Model Comparisons

Linear regression based on Baseline PCA Model

Data Split

set.seed(10)

# 70/30 train-test split; spot-check the outcome distribution in each part.
data_split <- initial_split(data, prop = 0.70)

# Training set:
train_data <- training(data_split)
summary(train_data$PM_BMI_SR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.86   23.37   26.45   27.32   30.13   67.31
# Test set:
test_data <- testing(data_split)
summary(test_data$PM_BMI_SR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.86   23.40   26.46   27.37   30.16   69.40

Model

# Linear regression specification fitted via the glm engine.
linear_model <- linear_reg() |>
        set_mode("regression") |>
        set_engine("glm")

linear_model
## Linear Regression Model Specification (regression)
## 
## Computational engine: glm

Recipe

# Baseline recipe: predict BMI from the first 7 principal components of the
# numeric predictors. The `clusters` factor is untouched by the numeric PCA
# step and is dummy-coded at fit time (see the cluster coefficients in the
# fitted model output).
# Centering is applied before scaling for convention and consistency with
# the earlier PCA recipe; the standardized result is identical either way.
pca_baseline_recipe <- recipe(PM_BMI_SR ~ ., data = train_data) %>%
  update_role(ID, new_role = "id") %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), num_comp = 7, id = "pca_id") %>%
  step_zv(all_predictors())

pca_baseline_recipe

Workflow

# Combine the linear model with the PCA recipe.
baseline_bmi_workflow <- workflow() %>%
  add_recipe(pca_baseline_recipe) %>%
  add_model(linear_model)

baseline_bmi_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_scale()
## • step_center()
## • step_pca()
## • step_zv()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: glm

Fit a model

# Fit on the training data and show tidied coefficients.
baseline_bmi_fit <- fit(baseline_bmi_workflow, data = train_data)
options(scipen = 999, digits = 3)  # readable coefficient display

baseline_bmi_fit_extract <- baseline_bmi_fit |>
  extract_fit_parsnip() |>
  tidy()
baseline_bmi_fit_extract
## # A tibble: 11 × 5
##    term              estimate std.error statistic  p.value
##    <chr>                <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)         24.1      0.115     211.   0       
##  2 clustersCluster_2    7.94     0.157      50.6  0       
##  3 clustersCluster_3    1.40     0.194       7.24 4.70e-13
##  4 clustersCluster_4    4.63     0.249      18.6  1.45e-75
##  5 PC1                 -0.538    0.0608     -8.85 9.99e-19
##  6 PC2                 -0.168    0.0627     -2.68 7.42e- 3
##  7 PC3                 -0.238    0.0487     -4.88 1.08e- 6
##  8 PC4                  0.108    0.0502      2.15 3.13e- 2
##  9 PC5                  0.571    0.0535     10.7  2.33e-26
## 10 PC6                 -0.117    0.0528     -2.22 2.65e- 2
## 11 PC7                  0.572    0.0523     10.9  1.12e-27

Predict on test data

# Augment the test set with predictions, then plot predicted vs measured BMI.
baseline_bmi_predicted <- augment(baseline_bmi_fit, test_data)

baseline_bmi_predicted |>
  ggplot(aes(x = PM_BMI_SR, y = .pred)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs(x = "Measured BMI", y = "Predicted BMI") +
  theme_minimal()

Quick check of the predictions

# Quick look at truth vs prediction. The original bind_cols() duplicated
# PM_BMI_SR (augment() already carries it), producing PM_BMI_SR...1 /
# PM_BMI_SR...7 columns; reorder the augmented data instead.
pred_true <- baseline_bmi_predicted |>
  dplyr::relocate(PM_BMI_SR, .pred, .resid)

head(pred_true)
## # A tibble: 6 × 17
##   PM_BMI_SR...1 .pred .resid ID        SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR...7
##           <dbl> <dbl>  <dbl> <chr>            <dbl>          <dbl>         <dbl>
## 1          22.5  24.8 -2.32  SYN_58624           58           594           22.5
## 2          32.2  25.2  7.00  SYN_5862…           55          5332.          32.2
## 3          39.5  25.7 13.8   SYN_5862…           30            66           39.5
## 4          28.3  32.8 -4.47  SYN_5862…           46            99           28.3
## 5          30.6  31.6 -0.972 SYN_5862…           62          4788           30.6
## 6          47.7  30.9 16.8   SYN_5862…           60           480           47.7
## # ℹ 10 more variables: SDC_EDU_LEVEL_AGE <dbl>, PSE_ADULT_WRK_DURATION <dbl>,
## #   PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## #   NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## #   ALC_CUR_FREQ <dbl>, clusters <fct>

Linear regression with tuning of cluster

set.seed(10)

# Create a split object: 70/30 split of the cluster-labelled data (data2).
data_split2 <- initial_split(data2, prop = 0.70)

# Build training and testing data set
train_data2 <- training(data_split2)
test_data2 <- testing(data_split2)

# Outcome distribution and cluster counts in each partition.
summary(train_data2$PM_BMI_SR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.86   23.37   26.45   27.32   30.13   67.31
summary(test_data2$PM_BMI_SR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.86   23.40   26.46   27.37   30.16   69.40
table(train_data2$clusters)
## 
## Cluster_1 Cluster_2 Cluster_3 Cluster_4 
##      3097      2477      2286      1409
table(test_data2$clusters)
## 
## Cluster_1 Cluster_2 Cluster_3 Cluster_4 
##      1326      1035      1015       597

Model

# Same linear specification for the cluster-only model.
linear_model2 <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("glm")

Recipe

# Regress BMI on cluster membership alone.
# BUG FIX: one_hot = TRUE together with the model's intercept made the
# dummy columns collinear, so glm returned NA for clusters_Cluster_4 (see
# the coefficient table below). Default treatment coding keeps the fitted
# values identical while giving interpretable contrasts against the
# reference cluster.
cluster_reg_recipe <- recipe(PM_BMI_SR ~ clusters, data = train_data2) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

cluster_reg_recipe

Workflow

# Workflow: cluster-only recipe + linear model.
bmi_workflow_2 <- workflow() |>
  add_model(linear_model2) |>
  add_recipe(cluster_reg_recipe)
bmi_workflow_2
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_dummy()
## • step_zv()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: glm

Fit the model

# Fit the cluster-only model and tidy its coefficients.
bmi_fit_2 <- fit(bmi_workflow_2, data = train_data2)
options(scipen = 999, digits = 3)  # readable coefficient display

bmi_fit_extract_2 <- bmi_fit_2 %>%
  extract_fit_parsnip() %>%
  tidy()
bmi_fit_extract_2
## # A tibble: 5 × 5
##   term               estimate std.error statistic    p.value
##   <chr>                 <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)           28.0      0.134     210.   0        
## 2 clusters_Cluster_1    -3.43     0.161     -21.3  3.71e- 98
## 3 clusters_Cluster_2     3.73     0.167      22.3  4.28e-107
## 4 clusters_Cluster_3    -2.34     0.170     -13.8  1.29e- 42
## 5 clusters_Cluster_4    NA       NA          NA   NA

Predictions

# Predictions (with residuals) on the held-out test set.
bmi_predicted_2 <- augment(bmi_fit_2, test_data2)

bmi_predicted_2
## # A tibble: 3,973 × 16
##    .pred .resid ID       SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR SDC_EDU_LEVEL_AGE
##    <dbl>  <dbl> <chr>           <dbl>          <dbl>     <dbl>             <dbl>
##  1  24.6  -2.16 SYN_586…           58           594       22.5                17
##  2  25.7   6.47 SYN_586…           55          5332.      32.2                28
##  3  24.6  14.9  SYN_586…           30            66       39.5                26
##  4  31.8  -3.45 SYN_586…           46            99       28.3                22
##  5  31.8  -1.18 SYN_586…           62          4788       30.6                20
##  6  31.8  16.0  SYN_586…           60           480       47.7                18
##  7  31.8  20.6  SYN_586…           48             0       52.4                19
##  8  25.7   1.75 SYN_586…           31          4932       27.5                18
##  9  25.7   4.75 SYN_586…           69          9198       30.5                45
## 10  25.7   1.63 SYN_586…           66         10430       27.3                25
## # ℹ 3,963 more rows
## # ℹ 9 more variables: PSE_ADULT_WRK_DURATION <dbl>,
## #   PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## #   NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## #   ALC_CUR_FREQ <dbl>, clusters <fct>

Quick check of the predictions

# Quick look at truth vs prediction. As with the baseline model, the
# original bind_cols() duplicated PM_BMI_SR (augment() already carries it);
# reorder the augmented data instead.
pred_true_2 <- bmi_predicted_2 |>
  dplyr::relocate(PM_BMI_SR, .pred, .resid)

head(pred_true_2)
## # A tibble: 6 × 17
##   PM_BMI_SR...1 .pred .resid ID        SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR...7
##           <dbl> <dbl>  <dbl> <chr>            <dbl>          <dbl>         <dbl>
## 1          22.5  24.6  -2.16 SYN_58624           58           594           22.5
## 2          32.2  25.7   6.47 SYN_5862…           55          5332.          32.2
## 3          39.5  24.6  14.9  SYN_5862…           30            66           39.5
## 4          28.3  31.8  -3.45 SYN_5862…           46            99           28.3
## 5          30.6  31.8  -1.18 SYN_5862…           62          4788           30.6
## 6          47.7  31.8  16.0  SYN_5862…           60           480           47.7
## # ℹ 10 more variables: SDC_EDU_LEVEL_AGE <dbl>, PSE_ADULT_WRK_DURATION <dbl>,
## #   PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## #   NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## #   ALC_CUR_FREQ <dbl>, clusters <fct>

Comparison

# Compare held-out performance (rmse / rsq / mae) of the baseline PCA
# regression against the cluster-augmented linear model.
baseline_metrics <- baseline_bmi_predicted |>
  metrics(truth = PM_BMI_SR, estimate = .pred)

tuned_metrics <- bmi_predicted_2 |>
  metrics(truth = PM_BMI_SR, estimate = .pred)

baseline_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       5.01 
## 2 rsq     standard       0.277
## 3 mae     standard       3.74
tuned_metrics
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       5.12 
## 2 rsq     standard       0.245
## 3 mae     standard       3.82

Interpretation: The baseline PCA regression model has an RMSE of approximately 5.01, while the linear regression model with the cluster variable has an RMSE of approximately 5.12. This suggests that the baseline PCA model has slightly better predictive performance than the cluster-based model. However, the difference in RMSE is relatively small, indicating that both models have similar predictive accuracy for BMI in this dataset. Further analysis would be needed to determine whether the difference in performance is statistically significant and to explore other factors that could influence BMI predictions.

Features importance

# Extract fitted coefficients, keep terms with |estimate| > 0.5, and
# order them by absolute magnitude. The NA estimate for the aliased
# Cluster_4 dummy is dropped by filter() either way.
coeff <- tidy(bmi_fit_2) |>
  filter(abs(estimate) > 0.5) |>
  arrange(desc(abs(estimate)))

knitr::kable(coeff)
term estimate std.error statistic p.value
(Intercept) 28.05 0.134 209.8 0
clusters_Cluster_2 3.73 0.167 22.3 0
clusters_Cluster_1 -3.43 0.161 -21.3 0
clusters_Cluster_3 -2.34 0.170 -13.8 0

Plot of feature importance

# Horizontal bar chart of the retained coefficients (feature importance).
coeff |>
  ggplot(aes(x = term, y = estimate, fill = term)) +
  geom_col() +
  coord_flip()

Interpretation: The feature importance plot shows that the cluster variable has a substantial impact on predicted BMI. Cluster 2 appears to represent the highest-BMI group, while Clusters 1 and 3 have lower mean BMI than the reference group.

# Record R version, platform, and package versions for reproducibility.
sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Regina
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] knitr_1.51         GGally_2.4.0       tidyclust_0.2.4    see_0.12.0        
##  [5] performance_0.15.3 corrr_0.4.5        themis_1.0.3       glmnet_4.1-10     
##  [9] Matrix_1.7-4       vip_0.4.5          mlbench_2.1-6      gtsummary_2.5.0   
## [13] finalfit_1.1.0     psych_2.5.6        plotly_4.12.0      sjPlot_2.9.0      
## [17] reportRmd_0.1.1    yardstick_1.3.2    workflowsets_1.1.1 workflows_1.3.0   
## [21] tune_2.0.1         tailor_0.1.0       rsample_1.3.1      recipes_1.3.1     
## [25] parsnip_1.4.1      modeldata_1.5.1    infer_1.1.0        dials_1.4.2       
## [29] scales_1.4.0       broom_1.0.11       tidymodels_1.4.1   lubridate_1.9.4   
## [33] forcats_1.0.1      stringr_1.6.0      dplyr_1.1.4        purrr_1.2.0       
## [37] readr_2.1.6        tidyr_1.3.2        tibble_3.3.0       ggplot2_4.0.1     
## [41] tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.5.2         hardhat_1.4.2         rpart_4.1.24         
##   [4] sparsevctrs_0.3.5     lifecycle_1.0.4       Rdpack_2.6.4         
##   [7] rstatix_0.7.3         globals_0.18.0        lattice_0.22-7       
##  [10] vroom_1.6.7           MASS_7.3-65           insight_1.4.4        
##  [13] crosstalk_1.2.2       backports_1.5.0       magrittr_2.0.4       
##  [16] sass_0.4.10           rmarkdown_2.30        jquerylib_0.1.4      
##  [19] yaml_2.3.12           otel_0.2.0            cowplot_1.2.0        
##  [22] minqa_1.2.8           RColorBrewer_1.1-3    abind_1.4-8          
##  [25] nnet_7.3-20           ipred_0.9-15          lava_1.8.2           
##  [28] seriation_1.5.8       listenv_0.10.0        parallelly_1.46.1    
##  [31] svglite_2.2.2         codetools_0.2-20      xml2_1.5.1           
##  [34] tidyselect_1.2.1      shape_1.4.6.1         farver_2.1.2         
##  [37] lme4_1.1-38           TSP_1.2.6             stats4_4.5.2         
##  [40] jsonlite_2.0.0        mitml_0.4-5           Formula_1.2-5        
##  [43] survival_3.8-3        iterators_1.0.14      systemfonts_1.3.1    
##  [46] foreach_1.5.2         tools_4.5.2           cmprsk_2.2-12        
##  [49] Rcpp_1.1.0            glue_1.8.0            mnormt_2.1.1         
##  [52] prodlim_2025.04.28    gridExtra_2.3         pan_1.9              
##  [55] xfun_0.55             mgcv_1.9-3            flexclust_1.5.0      
##  [58] ca_0.71.1             withr_3.0.2           ROSE_0.0-4           
##  [61] fastmap_1.2.0         boot_1.3-32           digest_0.6.39        
##  [64] timechange_0.3.0      R6_2.6.1              mice_3.19.0          
##  [67] textshaping_1.0.4     utf8_1.2.6            generics_0.1.4       
##  [70] data.table_1.18.0     class_7.3-23          httr_1.4.7           
##  [73] htmlwidgets_1.6.4     ggstats_0.12.0        pkgconfig_2.0.3      
##  [76] gtable_0.3.6          timeDate_4051.111     modeltools_0.2-24    
##  [79] registry_0.5-1        GPfit_1.0-9           S7_0.2.1             
##  [82] furrr_0.3.1           htmltools_0.5.9       carData_3.0-5        
##  [85] kableExtra_1.4.0      gower_1.0.2           reformulas_0.4.3.1   
##  [88] rstudioapi_0.17.1     tzdb_0.5.0            nlme_3.1-168         
##  [91] nloptr_2.2.1          cachem_1.1.0          pillar_1.11.1        
##  [94] grid_4.5.2            vctrs_0.6.5           ggpubr_0.6.2         
##  [97] car_3.1-3             jomo_2.7-6            cluster_2.1.8.1      
## [100] lhs_1.2.0             evaluate_1.0.5        cli_3.6.5            
## [103] compiler_4.5.2        rlang_1.1.6           crayon_1.5.3         
## [106] future.apply_1.20.1   ggsignif_0.6.4        labeling_0.4.3       
## [109] stringi_1.8.7         modelenv_0.2.0        philentropy_0.10.0   
## [112] viridisLite_0.4.2     lazyeval_0.2.2        geepack_1.3.13       
## [115] aod_1.3.3             hms_1.1.4             bit64_4.6.0-1        
## [118] future_1.69.0         rbibutils_2.4         RcppParallel_5.1.11-1
## [121] bslib_0.9.0           bit_4.6.0             DiceDesign_1.10